In [1]:
# Notebook setup: inline plotting plus the analysis / modeling libraries.
%matplotlib inline
import numpy as np
import pandas as pd
# NOTE(review): sklearn.grid_search and sklearn.cross_validation were
# deprecated in scikit-learn 0.18 and removed in 0.20 (replaced by
# sklearn.model_selection); sklearn.externals.joblib was removed in 0.23
# (import joblib directly). Pin an old scikit-learn or update these imports.
from sklearn import grid_search
from sklearn import metrics
from sklearn import cross_validation
from sklearn.externals import joblib
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import operator
import itertools
import random
import os
import pickle
import time
In [2]:
# Dataset locations and the feature-set tag used to name every input file.
# NOTE(review): absolute Windows-only path — consider making this
# configurable (env var or relative Path) for portability.
DATA_DIRECTORY = "E:\\eaglesense\\data\\topviewkinect"
PREPROCESSED_DIRECTORY = DATA_DIRECTORY + "\\all"
FEATURE_SET = "eval-chi2"
In [3]:
# Ensure the output directory for result dumps exists. exist_ok=True avoids
# the check-then-create race of the previous exists()/makedirs() pair and is
# a no-op when the directory is already present.
os.makedirs("results", exist_ok=True)
In [4]:
# Load the per-frame feature table for the selected feature set.
features_csv = "{root}/{tag}_features.csv".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
features_df = pd.read_csv(features_csv)
In [5]:
# Preview the first rows of the feature table.
features_df.head()
Out[5]:
In [6]:
# Load the matching activity labels (keyed by the same "subject" column).
labels_csv = "{root}/{tag}_labels.csv".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
labels_df = pd.read_csv(labels_csv)
In [7]:
# Pre-computed train/test splits. Each pickle holds a dict with keys
# "X_train", "y_train", "X_test", "y_test" (see the unpacking cells below).
# s1/s2 are presumably two evaluation sessions, cs the cross-subject split,
# cs_noinfrared the cross-subject split without infrared features — TODO
# confirm against the preprocessing notebook.
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
s1_data_path = "{root}/{tag}_s1_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
s2_data_path = "{root}/{tag}_s2_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
cs_data_path = "{root}/{tag}_cs_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
noinfrared_data_path = "{root}/{tag}_cs_noinfrared_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
with open(s1_data_path, "rb") as f:
s1_data = pickle.load(f)
with open(s2_data_path, "rb") as f:
s2_data = pickle.load(f)
with open(cs_data_path, "rb") as f:
cs_data = pickle.load(f)
with open(noinfrared_data_path, "rb") as f:
noinfrared_data = pickle.load(f)
In [8]:
# Distinct subject ids present in the dataset.
unique_subjects = features_df["subject"].unique()
unique_subjects
Out[8]:
In [9]:
# The six activity classes; list position corresponds to the numeric label id
# used to index the confusion matrices below.
ACTIVITIES = ["Standing", "Sitting", "Pointing", "Phone", "Tablet", "Paper"]
In [10]:
# Number of activity classes (also used as the xgboost num_class parameter).
num_activities = len(ACTIVITIES)
num_activities
Out[10]:
In [11]:
# Final XGBoost hyper-parameters: multi-class softmax over the activity set,
# evaluated with multi-class classification error ("merror").
XGB_PARAM_FINAL = {
    "eta": 0.3,                    # learning rate
    "gamma": 1,                    # minimum loss reduction to make a split
    "lambda": 1,                   # L2 regularization on weights
    "alpha": 0,                    # L1 regularization on weights
    "max_depth": 6,
    "colsample_bytree": 0.5,       # feature subsampling per tree
    "colsample_bylevel": 0.5,      # feature subsampling per level
    "subsample": 0.5,              # row subsampling per boosting round
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": len(ACTIVITIES),
    "silent": 0,                   # verbose xgboost logging
}
XGB_NUM_ROUNDS = 200               # maximum boosting rounds
XGB_EARLYSTOPPING_ROUNDS = 30      # patience for early stopping
In [12]:
def crosssubject_test_split(features_df, labels_df, training_subjects_ids):
    """Split features/labels into train/test sets by subject id.

    Subjects listed in ``training_subjects_ids`` go to the training split;
    every other subject found in ``features_df`` goes to the test split.

    Parameters
    ----------
    features_df : pd.DataFrame
        Feature table with a ``subject`` column plus one column per feature.
    labels_df : pd.DataFrame
        Label table with ``subject`` and ``activity`` columns.
    training_subjects_ids : iterable
        Subject ids assigned to the training split.

    Returns
    -------
    X_train, y_train, X_test, y_test : np.ndarray
        Stacked feature matrices and (n, 1) label column vectors.
    """
    num_features = features_df.shape[1] - 1  # every column except "subject"
    # BUG FIX: derive the subject list from the data actually passed in,
    # instead of the notebook-global `unique_subjects` (hidden-state bug:
    # the function silently ignored its features_df argument's subjects).
    # Seed each list with a typed empty array so dtype promotion and the
    # empty-split case match the original vstack-based behavior exactly.
    train_X = [np.empty((0, num_features), dtype=np.float64)]
    train_y = [np.empty((0, 1), dtype=np.int32)]
    test_X = [np.empty((0, num_features), dtype=np.float64)]
    test_y = [np.empty((0, 1), dtype=np.int32)]
    for subject_id in features_df["subject"].unique():
        subject_features = features_df[features_df["subject"] == subject_id]
        subject_X = subject_features.drop(["subject"], axis=1).values
        subject_labels = labels_df[labels_df["subject"] == subject_id]
        subject_y = subject_labels[["activity"]].values
        if subject_id in training_subjects_ids:
            train_X.append(subject_X)
            train_y.append(subject_y)
        else:
            test_X.append(subject_X)
            test_y.append(subject_y)
    # Stack once (linear time) instead of np.vstack inside the loop
    # (quadratic copying).
    return (np.vstack(train_X), np.vstack(train_y),
            np.vstack(test_X), np.vstack(test_y))
In [13]:
def get_normalized_confusion_matrix(y_true, y_predicted):
    """Return the confusion matrix as row-normalized percentages.

    Each row (true class) sums to 100, so cell [i, j] is the percentage of
    class-i samples that were predicted as class j.
    """
    counts = metrics.confusion_matrix(y_true, y_predicted)
    row_totals = counts.sum(axis=1)[:, np.newaxis]
    return counts.astype("float") / row_totals * 100
In [14]:
# Session 1 evaluation: unpack the precomputed train/test split.
s1_X_train = s1_data["X_train"]
s1_y_train = s1_data["y_train"]
s1_X_test = s1_data["X_test"]
s1_y_test = s1_data["y_test"]
In [15]:
s1_X_train.shape
Out[15]:
In [16]:
s1_X_test.shape
Out[16]:
In [17]:
# Wrap the split in XGBoost DMatrices; the watchlist reports train/eval error.
s1_train_xgbmatrix = xgb.DMatrix(s1_X_train, s1_y_train)
s1_test_xgbmatrix = xgb.DMatrix(s1_X_test, s1_y_test)
s1_watchlist = [(s1_train_xgbmatrix, "train"), (s1_test_xgbmatrix, "eval")]
In [18]:
# Validation run with early stopping to pick the boosting-round count.
# NOTE(review): early stopping monitors the test set itself, so the final
# test accuracy below is optimistically biased — confirm this is intended.
s1_eval_results = {}
s1_validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=s1_train_xgbmatrix, evals=s1_watchlist, evals_result=s1_eval_results,
num_boost_round=XGB_NUM_ROUNDS, early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, verbose_eval=100)
In [19]:
# Retrain from scratch for exactly the best number of rounds (no watchlist).
s1_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=s1_train_xgbmatrix, num_boost_round=s1_validation.best_iteration+1)
In [20]:
s1_y_predicted = s1_booster.predict(s1_test_xgbmatrix)
In [21]:
s1_accuracy = metrics.accuracy_score(s1_y_test, s1_y_predicted)
s1_accuracy
Out[21]:
In [22]:
s1_confusion_matrix = get_normalized_confusion_matrix(s1_y_test, s1_y_predicted)
In [23]:
# Persist all session-1 results. The "earlystoppping" key typo is kept
# deliberately for compatibility with downstream readers of this pickle.
s1_results_dump = {
"eval_results": s1_eval_results,
"eval_earlystoppping_best_iteration": s1_validation.best_iteration+1,
"eval_earlystoppping_best_score": s1_validation.best_score,
"classifier": s1_booster,
"final_accuracy": s1_accuracy,
"final_confusion_matrix": s1_confusion_matrix
}
with open("results/s1.pickle", "wb") as f:
pickle.dump(s1_results_dump, f)
In [24]:
# Session 2 evaluation: identical pipeline to session 1 above.
s2_X_train = s2_data["X_train"]
s2_y_train = s2_data["y_train"]
s2_X_test = s2_data["X_test"]
s2_y_test = s2_data["y_test"]
In [25]:
s2_X_train.shape
Out[25]:
In [26]:
s2_X_test.shape
Out[26]:
In [27]:
s2_train_xgbmatrix = xgb.DMatrix(s2_X_train, s2_y_train)
s2_test_xgbmatrix = xgb.DMatrix(s2_X_test, s2_y_test)
s2_watchlist = [(s2_train_xgbmatrix, "train"), (s2_test_xgbmatrix, "eval")]
In [28]:
# Early-stopping validation run to pick the boosting-round count.
s2_eval_results = {}
s2_validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=s2_train_xgbmatrix, evals=s2_watchlist, evals_result=s2_eval_results,
num_boost_round=XGB_NUM_ROUNDS, early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, verbose_eval=100)
In [29]:
# Retrain for exactly the selected number of rounds.
s2_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=s2_train_xgbmatrix, num_boost_round=s2_validation.best_iteration+1)
In [30]:
s2_y_predicted = s2_booster.predict(s2_test_xgbmatrix)
In [31]:
s2_accuracy = metrics.accuracy_score(s2_y_test, s2_y_predicted)
s2_accuracy
Out[31]:
In [32]:
s2_confusion_matrix = get_normalized_confusion_matrix(s2_y_test, s2_y_predicted)
In [33]:
# Persist session-2 results ("earlystoppping" key typo kept for compatibility).
s2_results_dump = {
"eval_results": s2_eval_results,
"eval_earlystoppping_best_iteration": s2_validation.best_iteration+1,
"eval_earlystoppping_best_score": s2_validation.best_score,
"classifier": s2_booster,
"final_accuracy": s2_accuracy,
"final_confusion_matrix": s2_confusion_matrix
}
with open("results/s2.pickle", "wb") as f:
pickle.dump(s2_results_dump, f)
In [14]:
# Cross-subject evaluation data: train on some subjects, test on the others.
cs_X_train = cs_data["X_train"]
cs_y_train = cs_data["y_train"]
cs_X_test = cs_data["X_test"]
cs_y_test = cs_data["y_test"]
In [16]:
cs_X_train.shape
Out[16]:
In [17]:
cs_X_test.shape
Out[17]:
In [37]:
# NOTE(review): import scattered mid-notebook; move to the imports cell so
# the notebook survives Restart & Run All reviews cleanly.
from sklearn import ensemble
In [38]:
# Random-forest baseline for the cross-subject split (fixed seed for
# reproducibility; all cores via n_jobs=-1).
rf_clf = ensemble.RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=None, max_features="sqrt",
random_state=42, n_jobs=-1)
In [39]:
# Wall-clock training time in seconds.
rf_training_start = time.time()
rf_clf.fit(cs_X_train, cs_y_train.ravel())
rf_training_time = (time.time() - rf_training_start)
rf_training_time
Out[39]:
In [40]:
# Wall-clock batch-prediction time over the whole test set, in seconds.
rf_testing_start = time.time()
rf_y_predicted = rf_clf.predict(cs_X_test)
rf_testing_time = (time.time() - rf_testing_start)
rf_testing_time
Out[40]:
In [41]:
# Training-set accuracy (upper bound / overfitting check).
rf_y_train_predicted = rf_clf.predict(cs_X_train)
rf_train_accuracy = metrics.accuracy_score(cs_y_train, rf_y_train_predicted)
rf_train_accuracy
Out[41]:
In [42]:
rf_accuracy = metrics.accuracy_score(cs_y_test, rf_y_predicted)
rf_accuracy
Out[42]:
In [43]:
rf_confusion_matrix = get_normalized_confusion_matrix(cs_y_test, rf_y_predicted)
In [44]:
# Persist the random-forest baseline results.
rf_results_dump = {
"training_time": rf_training_time,
"testing_time": rf_testing_time,
"training_accuracy": rf_train_accuracy,
"final_accuracy": rf_accuracy,
"final_confusion_matrix": rf_confusion_matrix
}
with open("results/cs_rf.pickle", "wb") as f:
pickle.dump(rf_results_dump, f)
In [53]:
cs_X_train.shape
Out[53]:
In [54]:
cs_X_test.shape
Out[54]:
In [20]:
# Cross-subject XGBoost model: DMatrices and train/eval watchlist.
cs_train_xgbmatrix = xgb.DMatrix(cs_X_train, cs_y_train)
cs_test_xgbmatrix = xgb.DMatrix(cs_X_test, cs_y_test)
cs_watchlist = [(cs_train_xgbmatrix, "train"), (cs_test_xgbmatrix, "eval")]
In [21]:
# Early-stopping validation run to pick the boosting-round count.
cs_eval_results = {}
cs_validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=cs_train_xgbmatrix, evals=cs_watchlist, evals_result=cs_eval_results,
num_boost_round=XGB_NUM_ROUNDS, early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, verbose_eval=100)
In [22]:
# Retrain for the selected round count and record wall-clock training time.
xgboost_training_start = time.time()
cs_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=cs_train_xgbmatrix, num_boost_round=cs_validation.best_iteration+1)
xgboost_training_time = (time.time() - xgboost_training_start)
xgboost_training_time
Out[22]:
In [55]:
# Accumulates per-sample prediction latencies (seconds) for the timing cells.
total_time = list()
In [ ]:
# Measure per-sample prediction latency: one DMatrix and one predict() call
# per sample, over both the training and testing sets. reshape((1, -1))
# derives the feature count from the data instead of the previous hard-coded
# 72, so this cell works for any feature set (e.g. the noinfrared variant).
# training
for i in range(cs_X_train.shape[0]):
    x = cs_X_train[i, :]
    x = x.reshape((1, -1))  # single-row matrix; -1 infers the feature count
    x_dmatrix = xgb.DMatrix(x)
    start = time.time()
    cs_booster.predict(x_dmatrix)
    total_time.append(time.time() - start)
# testing
for i in range(cs_X_test.shape[0]):
    x = cs_X_test[i, :]
    x = x.reshape((1, -1))
    x_dmatrix = xgb.DMatrix(x)
    start = time.time()
    cs_booster.predict(x_dmatrix)
    total_time.append(time.time() - start)
In [ ]:
# Mean per-sample prediction latency, in seconds.
avg_time = np.mean(total_time)
In [ ]:
# ...converted to milliseconds for display.
avg_time * 1000
In [ ]:
# Standard deviation of the per-sample latency, in seconds.
std_time = np.std(total_time)
In [ ]:
std_time * 1000
In [50]:
# Wall-clock batch-prediction time over the whole cross-subject test set.
xgboost_testing_start = time.time()
cs_y_predicted = cs_booster.predict(cs_test_xgbmatrix)
xgboost_testing_time = (time.time() - xgboost_testing_start)
xgboost_testing_time
Out[50]:
In [51]:
# BUG FIX: the cross-subject XGBoost model's training accuracy was computed
# with the random-forest baseline (rf_clf) from an earlier cell, so the
# reported "cs_train_accuracy" described the wrong model. Predict with the
# trained booster on the training DMatrix instead.
cs_y_train_predicted = cs_booster.predict(cs_train_xgbmatrix)
cs_train_accuracy = metrics.accuracy_score(cs_y_train, cs_y_train_predicted)
cs_train_accuracy
Out[51]:
In [52]:
cs_accuracy = metrics.accuracy_score(cs_y_test, cs_y_predicted)
cs_accuracy
Out[52]:
In [53]:
cs_confusion_matrix = get_normalized_confusion_matrix(cs_y_test, cs_y_predicted)
In [54]:
# Per-subject breakdown: evaluate the cross-subject booster on each subject's
# frames separately.
# NOTE(review): this iterates ALL subjects, including those the booster was
# trained on, whose scores will look inflated — confirm intended.
cs_confusion_matrix_subjects = list()
for subject_id in unique_subjects:
subject_features = features_df[features_df["subject"] == subject_id]
subject_features = subject_features.drop(["subject"], axis=1)
subject_labels = labels_df[labels_df["subject"] == subject_id]
subject_labels = subject_labels[["activity"]]
subject_X = subject_features.values
subject_y = subject_labels.values
subject_xgbmatrix = xgb.DMatrix(subject_X, subject_y)
subject_y_predicted = cs_booster.predict(subject_xgbmatrix)
subject_accuracy = metrics.accuracy_score(subject_y, subject_y_predicted)
subject_confusion_matrix = get_normalized_confusion_matrix(subject_y, subject_y_predicted)
cs_confusion_matrix_subjects.append((subject_id, subject_accuracy, subject_confusion_matrix))
In [55]:
# Per-activity accuracy = diagonal of the row-normalized confusion matrix.
for activity_idx, activity in enumerate(ACTIVITIES):
activity_accuracy = cs_confusion_matrix[activity_idx, activity_idx]
activity_error = 100 - activity_accuracy
print(activity, "\tAccuracy:", activity_accuracy, "\tError:", activity_error)
In [56]:
# Persist cross-subject results ("earlystoppping" typo kept for compatibility).
cs_results_dump = {
"training_time": xgboost_training_time,
"testing_time": xgboost_testing_time,
"eval_results": cs_eval_results,
"eval_earlystoppping_best_iteration": cs_validation.best_iteration+1,
"eval_earlystoppping_best_score": cs_validation.best_score,
"classifier": cs_booster,
"training_accuracy": cs_train_accuracy,
"final_accuracy": cs_accuracy,
"final_confusion_matrix": cs_confusion_matrix,
"subject_confusion_matrix": cs_confusion_matrix_subjects
}
with open("results/cs.pickle", "wb") as f:
pickle.dump(cs_results_dump, f)
In [57]:
# Cross-subject evaluation without infrared features (same pipeline).
noinfrared_X_train = noinfrared_data["X_train"]
noinfrared_y_train = noinfrared_data["y_train"]
noinfrared_X_test = noinfrared_data["X_test"]
noinfrared_y_test = noinfrared_data["y_test"]
In [58]:
noinfrared_X_train.shape
Out[58]:
In [59]:
noinfrared_X_test.shape
Out[59]:
In [60]:
noinfrared_train_xgbmatrix = xgb.DMatrix(noinfrared_X_train, noinfrared_y_train)
noinfrared_test_xgbmatrix = xgb.DMatrix(noinfrared_X_test, noinfrared_y_test)
noinfrared_watchlist = [(noinfrared_train_xgbmatrix, "train"), (noinfrared_test_xgbmatrix, "eval")]
In [61]:
# Early-stopping validation run to pick the boosting-round count.
noinfrared_eval_results = {}
noinfrared_validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=noinfrared_train_xgbmatrix, evals=noinfrared_watchlist,
evals_result=noinfrared_eval_results, num_boost_round=XGB_NUM_ROUNDS,
early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, verbose_eval=100)
In [62]:
# Retrain for exactly the selected number of rounds.
noinfrared_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=noinfrared_train_xgbmatrix,
num_boost_round=noinfrared_validation.best_iteration+1)
In [63]:
noinfrared_y_predicted = noinfrared_booster.predict(noinfrared_test_xgbmatrix)
In [64]:
noinfrared_accuracy = metrics.accuracy_score(noinfrared_y_test, noinfrared_y_predicted)
noinfrared_accuracy
Out[64]:
In [65]:
noinfrared_confusion_matrix = get_normalized_confusion_matrix(noinfrared_y_test, noinfrared_y_predicted)
In [66]:
# Persist the no-infrared results.
# NOTE(review): variable spelled "noninfrared" here vs "noinfrared"
# everywhere else — harmless (used consistently below) but inconsistent.
noninfrared_results_dump = {
"eval_results": noinfrared_eval_results,
"eval_earlystoppping_best_iteration": noinfrared_validation.best_iteration+1,
"eval_earlystoppping_best_score": noinfrared_validation.best_score,
"classifier": noinfrared_booster,
"final_accuracy": noinfrared_accuracy,
"final_confusion_matrix": noinfrared_confusion_matrix,
}
with open("results/cs_noinfrared.pickle", "wb") as f:
pickle.dump(noninfrared_results_dump, f)
In [67]:
# Every way of choosing half of the subjects as the training set; the loop
# below trains/evaluates one model per combination.
cs_combinations = list(itertools.combinations(unique_subjects, int(len(unique_subjects)/2)))
len(cs_combinations)
Out[67]:
In [68]:
cs_combinations_results_csv = "results/cs_combinations.csv"
In [69]:
# Truncate the results CSV and write the header row once; per-combination
# rows (raw confusion counts a1..a6) are appended by the next cell's loop.
open(cs_combinations_results_csv, "w").close()
with open(cs_combinations_results_csv, "a") as f:
data_columns = pd.DataFrame(columns=["combination", "activity", "a1", "a2", "a3", "a4", "a5", "a6"])
data_columns.to_csv(f, header=True, index=False)
In [70]:
# Train and evaluate one cross-subject model per train/test combination,
# appending each combination's raw confusion counts to the running CSV.
for cs_combination_idx, cs_combination in enumerate(cs_combinations):
    print(cs_combination_idx, "... ", end="")
    # Split data for this subject combination
    comb_X_train, comb_y_train, comb_X_test, comb_y_test = crosssubject_test_split(
        features_df, labels_df, cs_combination)
    comb_train_matrix = xgb.DMatrix(comb_X_train, comb_y_train)
    comb_test_matrix = xgb.DMatrix(comb_X_test, comb_y_test)
    # Train with the boosting-round count selected earlier by early stopping
    comb_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=comb_train_matrix,
                             num_boost_round=cs_validation.best_iteration + 1)
    comb_y_predicted = comb_booster.predict(comb_test_matrix)
    # Raw (unnormalized) confusion counts for this combination
    comb_counts = metrics.confusion_matrix(comb_y_test, comb_y_predicted)
    comb_rows_df = pd.DataFrame(columns=["combination", "activity", "a1", "a2", "a3", "a4", "a5", "a6"])
    for activity_id, activity in enumerate(ACTIVITIES):
        comb_rows_df.loc[activity_id] = (
            [cs_combination_idx, activity]
            + [comb_counts[activity_id, col] for col in range(6)]
        )
    # Append this combination's rows (header was written by the previous cell)
    with open(cs_combinations_results_csv, "a") as f:
        comb_rows_df.to_csv(f, header=False, index=False)
In [71]:
# Reload the accumulated per-combination confusion counts.
combinations_results_df = pd.read_csv(cs_combinations_results_csv)
In [72]:
# Sum raw confusion counts over all combinations, then row-normalize to
# percentages (rows = true activity, columns = predicted activity).
combinations_confusion_matrix = np.zeros((num_activities, num_activities))
for activity_idx, activity in enumerate(ACTIVITIES):
combinations_activity_results = combinations_results_df[combinations_results_df["activity"] == activity]
for accuracy_idx, accuracy_column in enumerate(["a1", "a2", "a3", "a4", "a5", "a6"]):
combinations_confusion_matrix[activity_idx, accuracy_idx] = combinations_activity_results[accuracy_column].sum()
combinations_confusion_matrix_normalized = combinations_confusion_matrix.astype("float") / combinations_confusion_matrix.sum(axis=1)[:, np.newaxis]
combinations_confusion_matrix_normalized *= 100
In [73]:
# Overall accuracy = diagonal (correct) samples / all samples.
all_samples = np.sum(combinations_confusion_matrix)
In [74]:
accurate_samples = 0
for activity_id in range(len(ACTIVITIES)):
accurate_samples += combinations_confusion_matrix[activity_id, activity_id]
In [75]:
combinations_accuracy = accurate_samples / all_samples
combinations_accuracy
Out[75]:
In [76]:
# Persist the aggregated combination results.
combinations_results_dump = {
"accuracy": combinations_accuracy,
"confusion_matrix": combinations_confusion_matrix_normalized,
}
In [77]:
with open("results/cs_combinations.pickle", "wb") as f:
pickle.dump(combinations_results_dump, f)
In [78]:
# X.shape
In [79]:
# y.shape
In [80]:
# demo_train_xgbmatrix = xgb.DMatrix(X, y)
# demo_test_xgbmatrix = xgb.DMatrix(X, y)
# demo_watchlist = [(demo_train_xgbmatrix, "train"), (demo_test_xgbmatrix, "eval")]
In [81]:
# demo_results = {}
# demo_booster = xgb.train(XGB_PARAM_DEMO, demo_train_xgbmatrix, XGB_NUM_ROUNDS_DEMO, demo_watchlist, evals_result=demo_results, early_stopping_rounds=20)
In [82]:
# demo_booster.save_model("demo-xgboost.model")
In [83]:
# bst2 = xgb.Booster(model_file="demo-xgboost.model")
In [84]:
# test_dmatrix = xgb.DMatrix(X)
# y_predicted = bst2.predict(test_dmatrix)
# accuracy = metrics.accuracy_score(y, y_predicted)
In [85]:
# accuracy